#Required libraries

# Tidyverse for data science and exploration
require(dplyr)
Loading required package: dplyr

Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union
require(tidyr)
Loading required package: tidyr
require(readr)
Loading required package: readr
require(tibble)
Loading required package: tibble
require(stringr)
Loading required package: stringr
require(purrr)
Loading required package: purrr
require(forcats)
Loading required package: forcats
require(rlang)
Loading required package: rlang

Attaching package: ‘rlang’

The following objects are masked from ‘package:purrr’:

    %@%, as_function, flatten, flatten_chr, flatten_dbl, flatten_int, flatten_lgl, flatten_raw, invoke, list_along,
    modify, prepend, splice
# enhances tidyverse
require(tidylog) # additional logging
Loading required package: tidylog

Attaching package: ‘tidylog’

The following objects are masked from ‘package:tidyr’:

    drop_na, fill, gather, pivot_longer, pivot_wider, replace_na, spread, uncount

The following objects are masked from ‘package:dplyr’:

    add_count, add_tally, anti_join, count, distinct, distinct_all, distinct_at, distinct_if, filter, filter_all,
    filter_at, filter_if, full_join, group_by, group_by_all, group_by_at, group_by_if, inner_join, left_join, mutate,
    mutate_all, mutate_at, mutate_if, relocate, rename, rename_all, rename_at, rename_if, rename_with, right_join,
    sample_frac, sample_n, select, select_all, select_at, select_if, semi_join, slice, slice_head, slice_max,
    slice_min, slice_sample, slice_tail, summarise, summarise_all, summarise_at, summarise_if, summarize,
    summarize_all, summarize_at, summarize_if, tally, top_frac, top_n, transmute, transmute_all, transmute_at,
    transmute_if, ungroup

The following object is masked from ‘package:stats’:

    filter
require(magrittr) # additional data pipe syntax
Loading required package: magrittr

Attaching package: ‘magrittr’

The following object is masked from ‘package:rlang’:

    set_names

The following object is masked from ‘package:purrr’:

    set_names

The following object is masked from ‘package:tidyr’:

    extract
# for reading data in multiple formats
require(readxl)
Loading required package: readxl
require(haven)
Loading required package: haven
# visual analysis
require(ggplot2)
Loading required package: ggplot2
Want to understand how all the pieces fit together? Read R for Data Science: https://r4ds.had.co.nz/
require(GGally) # extensions to ggplot
Loading required package: GGally
Registered S3 method overwritten by 'GGally':
  method from   
  +.gg   ggplot2
require(gt) # well formatted tables
Loading required package: gt
# client-side interactive publishable graphics
require(plotly)
Loading required package: plotly
Registered S3 method overwritten by 'data.table':
  method           from
  print.data.table     
Registered S3 method overwritten by 'htmlwidgets':
  method           from         
  print.htmlwidget tools:rstudio

Attaching package: ‘plotly’

The following object is masked from ‘package:ggplot2’:

    last_plot

The following objects are masked from ‘package:tidylog’:

    distinct, filter, group_by, mutate, rename, select, slice, summarise, transmute, ungroup

The following object is masked from ‘package:stats’:

    filter

The following object is masked from ‘package:graphics’:

    layout
require(leaflet)
Loading required package: leaflet
require(crosstalk)
Loading required package: crosstalk
require(htmlwidgets)
Loading required package: htmlwidgets
# server-side interactive graphics
require(shiny)
Loading required package: shiny

Attaching package: ‘shiny’

The following object is masked from ‘package:crosstalk’:

    getDefaultReactiveDomain
require(shinyjs)
Loading required package: shinyjs
Find out advanced usage of shinyjs:
    https://deanattali.com/shinyjs/advanced

Attaching package: ‘shinyjs’

The following object is masked from ‘package:shiny’:

    runExample

The following object is masked from ‘package:gt’:

    html

The following objects are masked from ‘package:methods’:

    removeClass, show
# Canned Interactive EDA 
require(ExPanDaR)
Loading required package: ExPanDaR

Exploring KU Book Processing Charges

# read KU data frame
KUbpc.df <- read_csv("Public Data/openapc-de/data/bpc.csv")
Parsed with column specification:
cols(
  institution = col_character(),
  period = col_double(),
  euro = col_double(),
  doi = col_character(),
  backlist_oa = col_logical(),
  publisher = col_character(),
  book_title = col_character(),
  isbn = col_character(),
  isbn_print = col_character(),
  isbn_electronic = col_character(),
  license_ref = col_character(),
  indexed_in_crossref = col_logical(),
  doab = col_logical()
)
# read DOAB metadata

source('Public Data/DOAB/doabingest.R')
DOABmeta.df <- doabFetch()
embedded nul(s) found in input


head(KUbpc.df)
head(summary(KUbpc.df))
 institution            period          euro          doi            backlist_oa      publisher          book_title       
 Length:938         Min.   :2017   Min.   :1075   Length:938         Mode :logical   Length:938         Length:938        
 Class :character   1st Qu.:2017   1st Qu.:1875   Class :character   FALSE:357       Class :character   Class :character  
 Mode  :character   Median :2018   Median :1981   Mode  :character   TRUE :581       Mode  :character   Mode  :character  
                    Mean   :2018   Mean   :4368                                                                           
                    3rd Qu.:2019   3rd Qu.:8250                                                                           
                    Max.   :2020   Max.   :8978                                                                           
     isbn            isbn_print        isbn_electronic    license_ref        indexed_in_crossref    doab        
 Length:938         Length:938         Length:938         Length:938         Mode :logical       Mode :logical  
 Class :character   Class :character   Class :character   Class :character   FALSE:127           FALSE:44       
 Mode  :character   Mode  :character   Mode  :character   Mode  :character   TRUE :811           TRUE :894      
                                                                                                                
                                                                                                                
                                                                                                                
ggplot(data = KUbpc.df, aes(KUbpc.df$institution)) + geom_bar() 


ggplot(data = KUbpc.df, aes(KUbpc.df$euro)) + geom_histogram()

General Exploratory Data Analysis


ggplot(data = KUbpc.df) + geom_bar(mapping = aes(x = KUbpc.df$doab))


# Date to Doab
date_doab <- KUbpc.df %>% ggplot(data = KUbpc.df, mapping = aes(x = KUbpc.df$period, colour = KUbpc.df$doab)) + geom_freqpoly(binwidth = 0.1)
ggplotly(date_doab)


# publisher_euro <- KUbpc.df %>% 
# ggplot(data = KUbpc.df, mapping = aes(x = KUbpc.df$publisher, colour = KUbpc.df$euro)) + geom_freqpoly(binwidth = 0.1)

# Institution to Euro
institution_euro <- KUbpc.df %>% ggplot(data = KUbpc.df, mapping = aes(x = KUbpc.df$euro)) + geom_freqpoly(mapping = aes(colour = KUbpc.df$institution), binwidth = 500)

ggplotly(institution_euro)

NA

Idea: Publishers vs. Charges

Question: How do the top 25% of publishers divide up charges (in Euro)?

Observation: Charges are grouped around ~2000 Euros and ~8000 Euros.


publisher_counts <- KUbpc.df %>%
    group_by(publisher) %>%
    tally
tally: now 110 rows and 2 columns, ungrouped
sorted_counts = arrange(publisher_counts, desc(n))

total_n = sum(sorted_counts$n)
quarter_n = 0.25 * total_n
new_n = sum(sorted_counts$n[0:6])

sorted_counts %>% filter(n > 24)

# filtered <- filter(KUbpc.df$publisher %in% sorted_counts$publisher)

filtered <- filter(KUbpc.df, KUbpc.df$publisher == 'transcript Verlag' |
                     KUbpc.df$publisher == 'Duke University Press' |
                     KUbpc.df$publisher == 'University of Michigan Press' |
                     KUbpc.df$publisher == 'Manchester University Press' |
                     KUbpc.df$publisher == 'Pluto Press' |
                     KUbpc.df$publisher == 'Liverpool University Press')

head(filtered)

euro_publisher <- filtered %>% 
  ggplot(data = filtered, mapping = aes(x = filtered$publisher, y = filtered$euro), 
         aes(x = filtered$publisher, y = filtered$euro)) + 
  # geom_count(aes(color = ..n.., size = after_stat(prop), group = euro)) + 
  geom_count(aes(color = ..n.., group = euro)) + 
  scale_size_area(max_size = 10) + 
  theme(axis.text = element_text(size = rel(0.75))) +
  labs(title = "How Publishers Divide Charges", x = "Top 25% of Publishers", y = "Price (Euro)", color = 'Number of Copies') +
  scale_x_discrete(labels = function(x) str_wrap(str_replace_all(x, "foo", " "), width = 17))

# ggplot:
ggplotly(euro_publisher)
`group_by_()` is deprecated as of dplyr 0.7.0.
Please use `group_by()` instead.
See vignette('programming') for more help
This warning is displayed once every 8 hours.
Call `lifecycle::last_warnings()` to see where this warning was generated.

# crosstalk:
ft <- highlight_key(filtered)
gg_ft <- ggplot(data = ft, mapping = aes(x = filtered$publisher, y = filtered$euro)) + 
  geom_count(aes(color = ..n.., size = after_stat(prop), group = euro)) + 
  labs(title = "How Publishers Divide Charges", x = "Top 25% of Publishers", y = "Price (Euro)", color = 'Number of Copies') +
  scale_x_discrete(labels = function(x) str_wrap(str_replace_all(x, "foo", " "), width = 17))
cross_ft <- bscols(
  filter_select("publisher", "Select a publisher", ft, ~publisher),
  ggplotly(gg_ft, dynamicTicks = TRUE),
  widths = c(12, 12)
)
All elements of `...` must be named.
Did you want `key = c(key)`?Sum of bscol width units is greater than 12
bscols(cross_ft)


# shared_euro_publisher <- SharedData$new(filtered)
# leaflet(shared_euro_publisher) %>% addMarkers()
# data.table::data.table(shared_euro_publisher)

Idea: Publishers’ Charges vs. Year/OA Type

Sub-Question: What best explains the particular division of charges? (Year, OA Type)

Observation: The low and high charge groups seem to be defined by the type of OA business model, whereas the slight differences within each group seem to be defined by the year.


head(filtered)

# Does Type of OA impact the particular division of charges?

euro_oa_publisher <- filtered %>% 
  ggplot(data = filtered, mapping = aes(x = filtered$backlist_oa, y = filtered$euro), 
         aes(x = filtered$backlist_oa, y = filtered$euro)) + 
  geom_count(aes(color = ..n.., group = euro)) + 
  scale_size_area(max_size = 10) + 
  theme(axis.text = element_text(size = rel(0.75))) +
  labs(title = "How OA Impacts Price Division of Charges", x = "Type of OA", y = "Price (Euro)", color = 'Number of Copies')

# ggplot:
ggplotly(euro_oa_publisher)


# crosstalk:
ft <- highlight_key(filtered)
gg_ft <- ggplot(data = ft, mapping = aes(x = filtered$backlist_oa, y = filtered$euro)) + 
  geom_count(aes(color = ..n.., size = after_stat(prop), group = euro)) + 
  labs(title = "How OA Impacts Division of Charges", x = "Type of OA", y = "Price (Euro)", color = 'Number of Copies')
cross_ft <- bscols(
  filter_select("publisher", "Select a publisher", ft, ~publisher),
  ggplotly(gg_ft, dynamicTicks = TRUE),
  widths = c(12, 12)
)
All elements of `...` must be named.
Did you want `key = c(key)`?Sum of bscol width units is greater than 12
bscols(cross_ft)



# Does Year impact the particular division of charges?

euro_year_publisher <- filtered %>% 
  ggplot(data = filtered, mapping = aes(x = filtered$period, y = filtered$euro), 
         aes(x = filtered$period, y = filtered$euro)) + 
  geom_count(aes(color = ..n.., group = euro)) + 
  scale_size_area(max_size = 10) + 
  theme(axis.text = element_text(size = rel(0.75))) +
  labs(title = "How Year Impacts Price Division of Charges", x = "Year", y = "Price (Euro)", color = 'Number of Copies')

# ggplot:
ggplotly(euro_year_publisher)


# crosstalk:
ft <- highlight_key(filtered)
gg_ft <- ggplot(data = ft, mapping = aes(x = filtered$period, y = filtered$euro)) + 
  geom_count(aes(color = ..n.., size = after_stat(prop), group = euro)) + 
  labs(title = "How Year Impacts Division of Charges", x = "Year", y = "Price (Euro)", color = 'Number of Copies')
cross_ft <- bscols(
  filter_select("publisher", "Select a publisher", ft, ~publisher),
  ggplotly(gg_ft, dynamicTicks = TRUE),
  widths = c(12, 12)
)
All elements of `...` must be named.
Did you want `key = c(key)`?Sum of bscol width units is greater than 12
bscols(cross_ft)

NA
NA

Idea: Publishers vs. OA

Question: What type of business model do the top 25% publishers use?

Observation: Most have a higher proportion of True (moved to OA from traditional publishing) than False (already published OA).


oa_type <- filtered %>% 
  ggplot(data = filtered, mapping = aes(x = filtered$publisher, colour = filtered$backlist_oa), fill = filtered$backlist_oa) +
  geom_bar(position = "fill", width = 0.7, fill="#EAEAEA") +
  labs(title = "Business Model OA for Publishers", x = "Top 25% of Publishers", y = "Proportion", color = 'Types of OA') +
  theme(axis.text = element_text(size = rel(0.75))) +
  scale_x_discrete(labels = function(x) str_wrap(str_replace_all(x, "foo", " "), width = 17)) +
  scale_color_brewer(palette = "Set1")

ggplotly(oa_type)


# crosstalk:
# Wrap `filtered` with highlight_key() so linked widgets can share row keys.
ft <- highlight_key(filtered)
# Stacked-proportion bars of backlist vs. non-backlist OA per publisher.
# Aesthetics are mapped by column name: `ft` is a SharedData (R6) object, so
# `ft$publisher` / `ft$backlist_oa` evaluate to NULL and would leave the plot
# without an x aesthetic. The former `fill = ...` argument sat outside aes()
# and was ignored, so it is dropped. The "foo" replacement in the label
# function matched none of the publisher names and is removed.
oa_ft <- ggplot(data = ft, mapping = aes(x = publisher, colour = backlist_oa)) +
  geom_bar(position = "fill", width = 0.7) +
  labs(title = "Business Model OA for Publishers", x = "Top 25% of Publishers", y = "Proportion of Backlist OA", color = 'Types of OA') +
  theme(axis.text = element_text(size = rel(0.75))) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 17))
# cross_oa_ft <- bscols(
#   filter_select("publisher", "Select a publisher", ft, ~publisher),
#   ggplotly(oa_ft, dynamicTicks = TRUE),
#   # widths = c(12, 12)
# )

# bscols(cross_oa_ft)

Idea: Publishers’ OA vs. Year

Question: Did OA business models of the top 25% publishers change per year?

Observation:


# Plot, for a single publisher, the share of backlist vs. non-backlist OA
# titles per year as an interactive (plotly) stacked-proportion bar chart.
#
# pub_name: publisher name, compared for equality with the `publisher` column.
# Returns the widget produced by ggplotly().
# Reads the global data frame `filtered` (columns publisher, period,
# backlist_oa).
oa_time <- function(pub_name) {
  pub_ft <- filter(filtered, publisher == pub_name)

  # `period` is numeric, so it is converted to a factor and the discrete scale
  # is given character limits; numeric limits on a discrete scale trigger
  # "Continuous limits supplied to discrete scale". The former `fill = ...`
  # argument to ggplot() was outside aes() and had no effect.
  pub_oa <- ggplot(
    data = pub_ft,
    mapping = aes(x = factor(period), colour = backlist_oa)
  ) +
    geom_bar(position = "fill", width = 0.7, fill = "#EAEAEA") +
    labs(title = paste0(pub_name, "'s OA Through the Years"),
         x = "Years", y = "Proportion of Backlist OA", color = 'Types of OA') +
    theme(axis.text = element_text(size = rel(0.75))) +
    scale_x_discrete(limits = c("2017", "2018", "2019")) +
    scale_color_brewer(palette = "Set1")

  ggplotly(pub_oa)
}

top25_list = c("transcript Verlag", "Duke University Press", "University of Michigan Press", "Manchester University Press", "Pluto Press", "Liverpool University Press")

oa_time("transcript Verlag")
Continuous limits supplied to discrete scale.
Did you mean `limits = factor(...)` or `scale_*_continuous()`?

oa_time("Duke University Press")
Continuous limits supplied to discrete scale.
Did you mean `limits = factor(...)` or `scale_*_continuous()`?

oa_time("University of Michigan Press")
Continuous limits supplied to discrete scale.
Did you mean `limits = factor(...)` or `scale_*_continuous()`?

oa_time("Manchester University Press")
Continuous limits supplied to discrete scale.
Did you mean `limits = factor(...)` or `scale_*_continuous()`?

oa_time("Pluto Press")
Continuous limits supplied to discrete scale.
Did you mean `limits = factor(...)` or `scale_*_continuous()`?

oa_time("Liverpool University Press")
Continuous limits supplied to discrete scale.
Did you mean `limits = factor(...)` or `scale_*_continuous()`?

Idea: Revenue vs. OA

Question: What total revenue (across all years) is each publisher receiving?

Observation:


# Finding total revenue for each publisher

# Total charges (Euro) recorded for one publisher.
#
# pub_name: publisher name, matched exactly against `filtered$publisher`.
# Returns the sum of `filtered$euro` over the matching rows (0 when no row
# matches). Reads the global data frame `filtered`.
revenue_finder <- function(pub_name) {
  # `%in%` never yields NA, so rows with a missing publisher are simply not
  # selected. The sum is the last expression, so the value is returned visibly
  # (the previous body ended on an assignment and returned invisibly).
  sum(filtered$euro[filtered$publisher %in% pub_name])
}

revenue_df <- data.frame("publisher" = top25_list)
revenue_list <- c()

for (i in top25_list) {
  revenue_list<-c(revenue_list,revenue_finder(i))
}

revenue_df$revenue <- c(revenue_list)
print(revenue_df)

# ggplot:
publisher_revenue <- revenue_df %>%
  ggplot(data = revenue_df, mapping = aes(x = revenue_df$publisher, y = revenue_df$revenue), fill = revenue_df$revenue) +
  geom_col() +
  labs(title = "Total Revenue for Publishers", x = "Top 25% of Publishers", y = "Revenue (Euro)", color = 'Types of OA') +
  theme(axis.text = element_text(size = rel(0.75))) +
  scale_x_discrete(labels = function(x) str_wrap(str_replace_all(x, "foo", " "), width = 17)) +
  scale_fill_brewer(palette = "Set1")

ggplotly(publisher_revenue)
Use of `revenue_df$publisher` is discouraged. Use `publisher` instead.Use of `revenue_df$revenue` is discouraged. Use `revenue` instead.

Idea: Revenue vs. OA

Question: What revenue are publishers receiving per year?

Observation:


# Finding total revenue for each publisher

revlist_2017 <- c()
revlist_2018 <- c()
revlist_2019 <- c()

revlist <- c()

for (name in top25_list) {
  pub_name <- filter(filtered, filtered$publisher == name)
  rev_2017 = sum(pub_name[pub_name$period == 2017,]$euro)
  revlist_2017 <- c(revlist_2017, rev_2017)
  rev_2018 = sum(pub_name[pub_name$period == 2018,]$euro)
  revlist_2018 <- c(revlist_2018, rev_2018)
  rev_2019 = sum(pub_name[pub_name$period == 2019,]$euro)
  revlist_2019 <- c(revlist_2019, rev_2019)
}

revenue_df <- data.frame("publisher" = top25_list)
revenue_df$'2017' <- c(revlist_2017)
revenue_df$'2018' <- c(revlist_2018)
revenue_df$'2019' <- c(revlist_2019)

print(revenue_df)

revenue_year <- c(revenue_df$'2017', revenue_df$'2018', revenue_df$'2019')
year <- c('2017', '2018', '2019')

# ggplot:
pub_year_revenue1 <- revenue_df %>%
  
  ggplot(data = revenue_df, mapping = aes(x = '2017', y = revenue_df$'2017', fill = revenue_df$publisher)) +
  geom_bar(position="dodge", stat="identity") +
  labs(title = "Total Revenue for Publishers", x = "Year", y = "Revenue (Euro)", color = 'Publishers') +
  theme(axis.text = element_text(size = rel(0.75)))

ggplotly(pub_year_revenue1)
Use of `revenue_df$"2017"` is discouraged. Use `2017` instead.Use of `revenue_df$publisher` is discouraged. Use `publisher` instead.

pub_year_revenue2 <- revenue_df %>%
  
  ggplot(data = revenue_df, mapping = aes(x = '2018', y = revenue_df$'2018', fill = revenue_df$publisher)) +
  geom_bar(position="dodge", stat="identity") +
  labs(title = "Total Revenue for Publishers", x = "Year", y = "Revenue (Euro)", color = 'Publishers') +
  theme(axis.text = element_text(size = rel(0.75)))

ggplotly(pub_year_revenue2)
Use of `revenue_df$"2018"` is discouraged. Use `2018` instead.Use of `revenue_df$publisher` is discouraged. Use `publisher` instead.

pub_year_revenue3 <- revenue_df %>%
  
  ggplot(data = revenue_df, mapping = aes(x = '2019', y = revenue_df$'2019', fill = revenue_df$publisher)) +
  geom_bar(position="dodge", stat="identity") +
  labs(title = "Total Revenue for Publishers", x = "Year", y = "Revenue (Euro)", color = 'Publishers') +
  theme(axis.text = element_text(size = rel(0.75)))

ggplotly(pub_year_revenue3)
Use of `revenue_df$"2019"` is discouraged. Use `2019` instead.Use of `revenue_df$publisher` is discouraged. Use `publisher` instead.

Continued: an attempt to put the per-year revenue of all publishers into one graph.


# Revenue (sum of `euro`) of every publisher in `top25_list` for one year,
# returned in the order of `top25_list`. A publisher with no rows in that year
# contributes 0. Reads the globals `filtered` and `top25_list`.
revenue_for_year <- function(year) {
  vapply(
    top25_list,
    function(pub) {
      sum(filtered$euro[filtered$publisher %in% pub & filtered$period %in% year])
    },
    numeric(1),
    USE.NAMES = FALSE
  )
}

# One vector per year instead of growing three vectors inside a loop.
revlist_2017 <- revenue_for_year(2017)
revlist_2018 <- revenue_for_year(2018)
revlist_2019 <- revenue_for_year(2019)

# Year-major layout: all publishers for 2017, then 2018, then 2019.
revlist <- c(revlist_2017, revlist_2018, revlist_2019)

print(revlist)
 [1] 127420 153491  43044  96849  51824  48131  58125  61875  54750  36000  53625  21375  51750  58125  36000  17625  40500  42375
# Reshape the year-major `revlist` vector into a year x publisher table.
# The column count follows `top25_list` instead of a hard-coded 6.
nrev <- matrix(
  revlist,
  ncol = length(top25_list),
  byrow = TRUE,
  dimnames = list(c("2017", "2018", "2019"), top25_list)
)
nrev <- as.data.frame(nrev)

print(nrev)

# Long format (one row per year/publisher pair) so every publisher can be
# drawn in a single chart. The earlier plot mapped `fill` to `nrev$publisher`,
# a column that does not exist, and `y` to the "transcript Verlag" column only.
nrev_long <- nrev %>%
  rownames_to_column("year") %>%
  pivot_longer(cols = -year, names_to = "publisher", values_to = "revenue")

pub_year_rev <- ggplot(
  data = nrev_long,
  mapping = aes(x = year, y = revenue, fill = publisher)
) +
  geom_col(position = "dodge") +
  # The legend belongs to the fill aesthetic, so it is titled via `fill`.
  labs(title = "Total Revenue for Publishers", x = "Year", y = "Revenue (Euro)", fill = "Publishers") +
  theme(axis.text = element_text(size = rel(0.75)))

ggplotly(pub_year_rev)
Use of `nrev$"transcript Verlag"` is discouraged. Use `transcript Verlag` instead.Use of `nrev$publisher` is discouraged. Use `publisher` instead.

Idea: DOAB analysis

Question: What is the average time gap between year of publication and added on date?

Observation:


# Keep only records that have a publication year. The previous condition,
# is.na(...), was inverted: it kept exactly the rows with a missing year, so
# every value printed below was <NA>.
DOABmeta.df <- filter(DOABmeta.df, !is.na(Year.of.publication))
print(DOABmeta.df$Year.of.publication[1:4])
[1] <NA> <NA> <NA> <NA>
694 Levels:  ...
# `-` is not defined for factors, so the columns are converted via character
# to numbers first. Values that are not numeric become NA and are excluded
# from the mean. The earlier `[1:3]` subset recycled three years against the
# whole date column and is dropped.
pub_year <- suppressWarnings(
  as.numeric(as.character(DOABmeta.df$Year.of.publication))
)
# NOTE(review): assumes Added.on.date text starts with a four-digit year
# (e.g. "2019-05-01") - TODO confirm against the DOAB export format.
added_year <- suppressWarnings(
  as.numeric(substr(as.character(DOABmeta.df$Added.on.date), 1, 4))
)
gap <- mean(added_year - pub_year, na.rm = TRUE)
‘-’ not meaningful for factors
print(gap)
[1] NA

Comparison of charges by year and backlist

Interactive charges exploration

### Interactive Dataset Exploration 
---
title: "Exploratory Analysis"
output: html_notebook
---

```{r}
#Required libraries

# Tidyverse for data science and exploration
require(dplyr)
require(tidyr)
require(readr)
require(tibble)
require(stringr)
require(purrr)
require(forcats)
require(rlang)

# enhances tidyverse
require(tidylog) # additional logging
require(magrittr) # additional data pipe syntax


# for reading data in multiple formats
require(readxl)
require(haven)

# visual analysis
require(ggplot2)
require(GGally) # extensions to ggplot
require(gt) # well formatted tables
# client-side interactive publishable graphics
require(plotly)
require(leaflet)
require(crosstalk)
require(htmlwidgets)
# server-side interactive graphics
require(shiny)
require(shinyjs)
# Canned Interactive EDA 
require(ExPanDaR)


```
## Exploring KU Book Processing Charges
```{r  }
# read KU data frame
# Book-processing-charge records, read with readr from the path below
# (relative to the working directory).
KUbpc.df <- read_csv("Public Data/openapc-de/data/bpc.csv")
# read DOAB metadata

# doabFetch() is not defined in this notebook; presumably it is provided by
# the sourced script, so source() has to run first - verify in doabingest.R.
source('Public Data/DOAB/doabingest.R')
DOABmeta.df <- doabFetch()
```
```{r  }


# First look at the KU data: leading rows and per-column summaries.
head(KUbpc.df)
head(summary(KUbpc.df))

# Number of records per institution. Columns are referenced by bare name
# inside aes(); `KUbpc.df$...` bypasses the `data` argument and ggplot2 flags
# it as discouraged.
ggplot(data = KUbpc.df, mapping = aes(x = institution)) +
  geom_bar()

# Distribution of charges in Euro. 30 bins is the geom_histogram() default,
# stated explicitly so the "pick better value" message is not emitted.
ggplot(data = KUbpc.df, mapping = aes(x = euro)) +
  geom_histogram(bins = 30)

```
## General Exploratory Data Analysis
```{r  }

# Number of records by DOAB flag.
ggplot(data = KUbpc.df) +
  geom_bar(mapping = aes(x = doab))

# Date to Doab
# The data frame is passed once via `data`; it used to be supplied both
# through the pipe and as `data =`, and columns were pulled with `$` in aes().
date_doab <- ggplot(data = KUbpc.df, mapping = aes(x = period, colour = doab)) +
  geom_freqpoly(binwidth = 0.1)
ggplotly(date_doab)

# publisher_euro <- KUbpc.df %>% 
# ggplot(data = KUbpc.df, mapping = aes(x = KUbpc.df$publisher, colour = KUbpc.df$euro)) + geom_freqpoly(binwidth = 0.1)

# Institution to Euro
institution_euro <- ggplot(data = KUbpc.df, mapping = aes(x = euro)) +
  geom_freqpoly(mapping = aes(colour = institution), binwidth = 500)

ggplotly(institution_euro)

```
## Idea: Publishers vs. Charges
## Question: How do the top 25% of publishers divide up charges (in Euro)?
## Observation: Charges are grouped around ~2000 Euros and ~8000 Euros. 
```{r  }

# Count records per publisher and rank publishers by number of records.
publisher_counts <- KUbpc.df %>%
  group_by(publisher) %>%
  tally()

sorted_counts <- arrange(publisher_counts, desc(n))

total_n <- sum(sorted_counts$n)
quarter_n <- 0.25 * total_n
# Records covered by the six largest publishers. head() replaces `[0:6]`,
# whose zero index selects nothing and only obscured the intent.
new_n <- sum(head(sorted_counts$n, 6))

sorted_counts %>% filter(n > 24)

# filtered <- filter(KUbpc.df$publisher %in% sorted_counts$publisher)

# Publishers analysed in the rest of the notebook.
top_publishers <- c(
  "transcript Verlag",
  "Duke University Press",
  "University of Michigan Press",
  "Manchester University Press",
  "Pluto Press",
  "Liverpool University Press"
)

# `%in%` replaces the chain of `==` comparisons joined with `|`; rows with a
# missing publisher are still excluded.
filtered <- filter(KUbpc.df, publisher %in% top_publishers)

head(filtered)

# Count plot of charge levels per publisher: one point per publisher/price
# combination, coloured (and, by geom_count's default, sized) by the number of
# records. Changes from the earlier version: the data frame is passed once
# (not via pipe and `data =`), the duplicate positional aes() is dropped,
# columns are named directly in aes(), `..n..` becomes after_stat(n), and the
# "foo" replacement that matched none of the publisher names is removed.
euro_publisher <- ggplot(data = filtered, mapping = aes(x = publisher, y = euro)) +
  # geom_count(aes(color = after_stat(n), size = after_stat(prop), group = euro)) +
  geom_count(aes(color = after_stat(n), group = euro)) +
  scale_size_area(max_size = 10) +
  theme(axis.text = element_text(size = rel(0.75))) +
  labs(title = "How Publishers Divide Charges", x = "Top 25% of Publishers", y = "Price (Euro)", color = 'Number of Copies') +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 17))

# ggplot:
ggplotly(euro_publisher)

# crosstalk:
# Wrap `filtered` with highlight_key() so the select box and the plot share
# row keys.
ft <- highlight_key(filtered)
# Aesthetics are taken from the shared data by column name rather than from
# `filtered$...`, and the computed count uses after_stat(n) like the `prop`
# mapping beside it.
gg_ft <- ggplot(data = ft, mapping = aes(x = publisher, y = euro)) +
  geom_count(aes(color = after_stat(n), size = after_stat(prop), group = euro)) +
  labs(title = "How Publishers Divide Charges", x = "Top 25% of Publishers", y = "Price (Euro)", color = 'Number of Copies') +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 17))
# NOTE(review): the widths add up to 24, which makes bscols() warn that the
# sum exceeds 12; kept as-is so the layout does not change.
cross_ft <- bscols(
  filter_select("publisher", "Select a publisher", ft, ~publisher),
  ggplotly(gg_ft, dynamicTicks = TRUE),
  widths = c(12, 12)
)

# `cross_ft` is already the bscols() result, so it is displayed directly
# instead of being wrapped in a second bscols() call.
cross_ft

# shared_euro_publisher <- SharedData$new(filtered)
# leaflet(shared_euro_publisher) %>% addMarkers()
# data.table::data.table(shared_euro_publisher)


```
## Idea: Publishers' Charges vs. Year/OA Type
## Sub-Question: What best explains the particular division of charges? (Year, OA Type)
## Observation: The low and high charge groups seem to be defined by the type of OA business model, whereas the slight differences within each group seem to be defined by the year. 
```{r  }

head(filtered)

# Does Type of OA impact the particular division of charges?

# Count plot of price against the backlist_oa flag. The data frame is passed
# once, the duplicate positional aes() is dropped, columns are named directly
# in aes(), and `..n..` is replaced by after_stat(n).
euro_oa_publisher <- ggplot(data = filtered, mapping = aes(x = backlist_oa, y = euro)) +
  geom_count(aes(color = after_stat(n), group = euro)) +
  scale_size_area(max_size = 10) +
  theme(axis.text = element_text(size = rel(0.75))) +
  labs(title = "How OA Impacts Price Division of Charges", x = "Type of OA", y = "Price (Euro)", color = 'Number of Copies')

# ggplot:
ggplotly(euro_oa_publisher)

# crosstalk:
ft <- highlight_key(filtered)
# Aesthetics come from the shared data by column name, not `filtered$...`.
gg_ft <- ggplot(data = ft, mapping = aes(x = backlist_oa, y = euro)) +
  geom_count(aes(color = after_stat(n), size = after_stat(prop), group = euro)) +
  labs(title = "How OA Impacts Division of Charges", x = "Type of OA", y = "Price (Euro)", color = 'Number of Copies')
# NOTE(review): widths summing to 24 make bscols() warn; kept to preserve the
# current layout.
cross_ft <- bscols(
  filter_select("publisher", "Select a publisher", ft, ~publisher),
  ggplotly(gg_ft, dynamicTicks = TRUE),
  widths = c(12, 12)
)

# Display the existing bscols() result directly (no second bscols() wrapper).
cross_ft


# Does Year impact the particular division of charges?

# Count plot of price against year. The data frame is passed once, the
# duplicate positional aes() is dropped, columns are named directly in aes(),
# and `..n..` is replaced by after_stat(n).
euro_year_publisher <- ggplot(data = filtered, mapping = aes(x = period, y = euro)) +
  geom_count(aes(color = after_stat(n), group = euro)) +
  scale_size_area(max_size = 10) +
  theme(axis.text = element_text(size = rel(0.75))) +
  labs(title = "How Year Impacts Price Division of Charges", x = "Year", y = "Price (Euro)", color = 'Number of Copies')

# ggplot:
ggplotly(euro_year_publisher)

# crosstalk:
ft <- highlight_key(filtered)
# Aesthetics come from the shared data by column name, not `filtered$...`.
gg_ft <- ggplot(data = ft, mapping = aes(x = period, y = euro)) +
  geom_count(aes(color = after_stat(n), size = after_stat(prop), group = euro)) +
  labs(title = "How Year Impacts Division of Charges", x = "Year", y = "Price (Euro)", color = 'Number of Copies')
# NOTE(review): widths summing to 24 make bscols() warn; kept to preserve the
# current layout.
cross_ft <- bscols(
  filter_select("publisher", "Select a publisher", ft, ~publisher),
  ggplotly(gg_ft, dynamicTicks = TRUE),
  widths = c(12, 12)
)

# Display the existing bscols() result directly (no second bscols() wrapper).
cross_ft


```
## Idea: Publishers vs. OA
## Question: What type of business model do the top 25% publishers use?
## Observation: Most have a higher proportion of True (moved to OA from traditional publishing) than False (already published OA).
```{r  }

# Stacked-proportion bars per publisher, outlined by the backlist_oa flag and
# filled with a constant light grey. The former `fill = ...` argument to
# ggplot() was outside aes() and had no effect (geom_bar sets a constant fill
# anyway), so it is removed. Columns are referenced by name in aes(), and the
# "foo" replacement that matched none of the publisher names is dropped.
oa_type <- ggplot(data = filtered, mapping = aes(x = publisher, colour = backlist_oa)) +
  geom_bar(position = "fill", width = 0.7, fill = "#EAEAEA") +
  labs(title = "Business Model OA for Publishers", x = "Top 25% of Publishers", y = "Proportion", color = 'Types of OA') +
  theme(axis.text = element_text(size = rel(0.75))) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 17)) +
  scale_color_brewer(palette = "Set1")

ggplotly(oa_type)

# crosstalk:
# Wrap `filtered` with highlight_key() so linked widgets can share row keys.
ft <- highlight_key(filtered)
# Aesthetics are mapped by column name: `ft` is a SharedData (R6) object, so
# `ft$publisher` / `ft$backlist_oa` evaluate to NULL and would leave the plot
# without an x aesthetic. The former `fill = ...` argument sat outside aes()
# and was ignored, so it is dropped. The "foo" replacement in the label
# function matched none of the publisher names and is removed.
oa_ft <- ggplot(data = ft, mapping = aes(x = publisher, colour = backlist_oa)) +
  geom_bar(position = "fill", width = 0.7) +
  labs(title = "Business Model OA for Publishers", x = "Top 25% of Publishers", y = "Proportion of Backlist OA", color = 'Types of OA') +
  theme(axis.text = element_text(size = rel(0.75))) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 17))
# cross_oa_ft <- bscols(
#   filter_select("publisher", "Select a publisher", ft, ~publisher),
#   ggplotly(oa_ft, dynamicTicks = TRUE),
#   # widths = c(12, 12)
# )

# bscols(cross_oa_ft)


```
## Idea: Publishers' OA vs. Year
## Question: Did OA business models of the top 25% publishers change per year?
## Observation:
```{r  }

# Plot, for a single publisher, the share of backlist vs. non-backlist OA
# titles per year as an interactive (plotly) stacked-proportion bar chart.
#
# pub_name: publisher name, compared for equality with the `publisher` column.
# Returns the widget produced by ggplotly().
# Reads the global data frame `filtered` (columns publisher, period,
# backlist_oa).
oa_time <- function(pub_name) {
  pub_ft <- filter(filtered, publisher == pub_name)

  # `period` is numeric, so it is converted to a factor and the discrete scale
  # is given character limits; numeric limits on a discrete scale trigger
  # "Continuous limits supplied to discrete scale". The former `fill = ...`
  # argument to ggplot() was outside aes() and had no effect.
  pub_oa <- ggplot(
    data = pub_ft,
    mapping = aes(x = factor(period), colour = backlist_oa)
  ) +
    geom_bar(position = "fill", width = 0.7, fill = "#EAEAEA") +
    labs(title = paste0(pub_name, "'s OA Through the Years"),
         x = "Years", y = "Proportion of Backlist OA", color = 'Types of OA') +
    theme(axis.text = element_text(size = rel(0.75))) +
    scale_x_discrete(limits = c("2017", "2018", "2019")) +
    scale_color_brewer(palette = "Set1")

  ggplotly(pub_oa)
}

# Publishers examined one by one below; later chunks reuse this vector.
top25_list <- c(
  "transcript Verlag",
  "Duke University Press",
  "University of Michigan Press",
  "Manchester University Press",
  "Pluto Press",
  "Liverpool University Press"
)

# One top-level call per publisher so each widget is displayed in turn.
oa_time(top25_list[[1]])

oa_time(top25_list[[2]])

oa_time(top25_list[[3]])

oa_time(top25_list[[4]])

oa_time(top25_list[[5]])

oa_time(top25_list[[6]])

```
## Idea: Revenue vs. OA
## Question: What total revenue (across all years) is each publisher receiving?
## Observation: 
```{r  }

# Finding total revenue for each publisher

# Total charges (Euro) recorded for one publisher.
#
# pub_name: publisher name, matched exactly against `filtered$publisher`.
# Returns the sum of `filtered$euro` over the matching rows (0 when no row
# matches). Reads the global data frame `filtered`.
revenue_finder <- function(pub_name) {
  # `%in%` never yields NA, so rows with a missing publisher are simply not
  # selected. The sum is the last expression, so the value is returned visibly
  # (the previous body ended on an assignment and returned invisibly).
  sum(filtered$euro[filtered$publisher %in% pub_name])
}

# One revenue total per publisher, in the order of `top25_list`.
# vapply() replaces a loop that grew the vector with c() on every iteration
# and checks that each call returns a single number.
revenue_list <- vapply(top25_list, revenue_finder, numeric(1), USE.NAMES = FALSE)

revenue_df <- data.frame("publisher" = top25_list)
revenue_df$revenue <- revenue_list
print(revenue_df)

# ggplot:
# Bar chart of total revenue per publisher. Removed because they had no
# effect: the `fill = ...` argument passed to ggplot() outside aes(), the fill
# scale that had no fill aesthetic to act on, and the `color` legend label
# copied from the OA plots. Columns are referenced by name in aes(), and the
# "foo" replacement that matched none of the publisher names is dropped.
publisher_revenue <- ggplot(data = revenue_df, mapping = aes(x = publisher, y = revenue)) +
  geom_col() +
  labs(title = "Total Revenue for Publishers", x = "Top 25% of Publishers", y = "Revenue (Euro)") +
  theme(axis.text = element_text(size = rel(0.75))) +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 17))

ggplotly(publisher_revenue)


```
## Idea: Revenue vs. OA
## Question: What revenue are publishers receiving per year?
## Observation: 
```{r  }

# Finding the revenue of each publisher in each year

report_years <- c(2017, 2018, 2019)

# Revenue of one publisher in each of `report_years`, in that order.
# `filtered` is scanned once per publisher and then split by `period`.
yearly_revenue <- function(pub) {
  pub_rows <- filter(filtered, publisher == pub)
  vapply(
    report_years,
    function(yr) sum(pub_rows$euro[pub_rows$period == yr]),
    numeric(1)
  )
}

# Matrix with one row per year and one column per publisher.
revenue_by_year <- vapply(
  top25_list, yearly_revenue, numeric(length(report_years)),
  USE.NAMES = FALSE
)

# Wide table: publisher plus one column per year ("2017", "2018", "2019").
revenue_df <- data.frame("publisher" = top25_list)
for (i in seq_along(report_years)) {
  revenue_df[[as.character(report_years[i])]] <- revenue_by_year[i, ]
}

print(revenue_df)

# Dodged bar chart of one year's revenue, one bar per publisher.
#
# year_col: name of a year column of revenue_df, e.g. "2017". It is used both
#           as the (constant) x value and to pick the y column.
# Returns:  a ggplot object.
plot_year_revenue <- function(year_col) {
  ggplot(
    revenue_df,
    aes(x = year_col, y = .data[[year_col]], fill = publisher)
  ) +
    geom_col(position = "dodge") +
    # The legend belongs to the fill aesthetic, so it must be titled via
    # `fill =`; the previous `color =` label was never displayed.
    labs(
      title = "Total Revenue for Publishers",
      x = "Year", y = "Revenue (Euro)", fill = "Publishers"
    ) +
    theme(axis.text = element_text(size = rel(0.75)))
}

# ggplot:
pub_year_revenue1 <- plot_year_revenue("2017")

ggplotly(pub_year_revenue1)

pub_year_revenue2 <- plot_year_revenue("2018")

ggplotly(pub_year_revenue2)

pub_year_revenue3 <- plot_year_revenue("2019")

ggplotly(pub_year_revenue3)


```
### Continued, tried putting it into one graph. 
```{r}

report_years <- c(2017, 2018, 2019)

# Revenue matrix: one row per year, one column per publisher. `filtered` is
# scanned once per publisher and then split by `period`.
rev_matrix <- vapply(
  top25_list,
  function(pub) {
    pub_rows <- filter(filtered, publisher == pub)
    vapply(
      report_years,
      function(yr) sum(pub_rows$euro[pub_rows$period == yr]),
      numeric(1)
    )
  },
  numeric(length(report_years)),
  USE.NAMES = FALSE
)
dimnames(rev_matrix) <- list(as.character(report_years), top25_list)

# Year-major vector (all publishers for 2017, then 2018, then 2019).
revlist <- as.vector(t(rev_matrix))

print(revlist)

# Wide table: years as row names, one column per publisher.
nrev <- as.data.frame(rev_matrix)

print(nrev)

# ggplot needs one row per (year, publisher) pair to draw every publisher in a
# single chart. The wide table has no `publisher` column, which is why the
# earlier attempt could only show one publisher and had no fill grouping.
# as.vector() walks the matrix column by column, i.e. publisher by publisher.
rev_long <- data.frame(
  year = rep(rownames(rev_matrix), times = ncol(rev_matrix)),
  publisher = factor(
    rep(colnames(rev_matrix), each = nrow(rev_matrix)),
    levels = unique(top25_list) # keep the legend in top25_list order
  ),
  revenue = as.vector(rev_matrix)
)

pub_year_rev <- ggplot(
  rev_long,
  aes(x = year, y = revenue, fill = publisher)
) +
  geom_col(position = "dodge") +
  # The legend belongs to the fill aesthetic, so it is titled via `fill =`.
  labs(
    title = "Total Revenue for Publishers",
    x = "Year", y = "Revenue (Euro)", fill = "Publishers"
  ) +
  theme(axis.text = element_text(size = rel(0.75)))

ggplotly(pub_year_rev)

```
## Idea: DOAB analysis
## Question: What is the average time gap between year of publication and added on date? 
## Observation: 
```{r}

# Keep only records where both fields are present. The original condition was
# inverted (it kept the rows whose year is missing) and overwrote DOABmeta.df;
# a separate data frame leaves the full metadata intact.
doab_dated <- filter(
  DOABmeta.df,
  !is.na(Year.of.publication),
  !is.na(Added.on.date)
)
print(head(doab_dated$Year.of.publication, 4))

# NOTE(review): assumes Added.on.date is a Date/POSIXct or an ISO-style
# "YYYY-MM-DD..." string, and that Year.of.publication holds a plain year --
# confirm against the DOAB export.
added_year <- as.numeric(format(as.Date(doab_dated$Added.on.date), "%Y"))
pub_year <- as.numeric(as.character(doab_dated$Year.of.publication))

# Mean gap in years, computed row by row (the previous version recycled only
# the first three publication years against every added-on date).
gap <- mean(added_year - pub_year, na.rm = TRUE)
print(gap)

```
### Comparison of charges by year and backlist
```{r}
# Faceted histogram of charges: `euro` in 6 bins, with one grid row per
# `period` and one grid column per `backlist_oa` value.
charges.plot <- ggplot(KUbpc.df, aes(euro)) +
  geom_histogram(bins = 6) +
  facet_grid(rows = vars(period), cols = vars(backlist_oa))

## Render as a standard (static) plot
plot(charges.plot)

# this plot will render publicly https://htmlpreview.github.io/?https://github.com/MIT-Informatics/monograph/blob/master/00%20EDA%20Start.nb.html

```
### Interactive charges exploration
```{r}
 # Interactive (plotly) rendering of the faceted charges histogram.
 charges.plot %>% ggplotly()
 # Published demo: https://mit-informatics.github.io/monograph/demo.html

```
### Interactive Dataset Exploration
```{r}
# Launch the ExPanD explorer on the book-processing-charge data.
# ExPanD is a shiny app: it works when R runs locally but not when the
# notebook is viewed through GitHub. Options are to publish it through
# shinyapps.io (low usage only) or to export a non-interactive notebook.
# See: https://drmaltman.shinyapps.io/demo/
KUbpc.df %>%
  ExPanD(
    df = .,
    title = "KU Book Processing Charges",
    export_nb_option = TRUE
  )
```

